*--------------------------------------------------------------;
* Selects a two-stage cluster sample, in which clusters are     ;
* selected (with replacement) with probability proportional     ;
* to size of cluster. A second independent sample is selected   ;
* from the complete cluster if a cluster is selected more       ;
* than once at the first stage.                                 ;
*                                                               ;
* Parameters                                                    ;
*   noprint  positional, any non-empty value suppresses the     ;
*            two PROC PRINT listings                            ;
*   frame=   data set of sampling units (default frame), it is  ;
*            sorted in place by the cluster variable            ;
*   cluster= single variable identifying the cluster            ;
*            (default cluster)                                  ;
*   setup=   data set merged by cluster (default setup), it is  ;
*            sorted in place. It is expected to hold one row    ;
*            per cluster and to supply the mi and mpopi         ;
*            variables (assumption, confirm against callers)    ;
*   npop=    accepted for compatibility, not used               ;
*   n=       number of first-stage draws. The default is the    ;
*            literal n, so a number should be passed            ;
*   mi=      variable holding the second-stage sample size      ;
*            of a cluster (default mi)                          ;
*   mpopi=   variable holding the number of units in a cluster  ;
*            (default mpopi)                                    ;
*   sample=  output data set (default sample)                   ;
*   rep=     replication variable created in the output         ;
*            (default rep)                                      ;
*   seed=    seed passed to RANUNI (default 0, clock based)     ;
*                                                               ;
* Work data sets written: weights_ unifs_ outcl_ nclusts_       ;
* ntry_ sel1_ stage1_                                           ;
*--------------------------------------------------------------;
%macro cl2pps(noprint,frame=,cluster=,setup=,npop=,
              n=,mi=,mpopi=,sample=,rep=,seed=);

  /* Keep helper macro variables out of any calling scope. */
  %local tot_ seed2_;

  /* Fill in defaults for parameters that were left blank. */
  %if %length(&sample)  = 0 %then %let sample  = %str(sample);
  %if %length(&frame)   = 0 %then %let frame   = %str(frame);
  %if %length(&cluster) = 0 %then %let cluster = cluster;
  %if %length(&setup)   = 0 %then %let setup   = %str(setup);
  %if %length(&n)       = 0 %then %let n       = %str(n);
  %if %length(&mpopi)   = 0 %then %let mpopi   = %str(mpopi);
  %if %length(&mi)      = 0 %then %let mi      = %str(mi);
  %if %length(&rep)     = 0 %then %let rep     = %str(rep);
  %if %length(&seed)    = 0 %then %let seed    = %str(0);

  /* ---- Stage 1: cluster sizes and cumulative size bounds ---- */

  proc sort data = &frame;
    by &cluster;
  run;

  /* One row per cluster: mi_ = number of frame rows in the cluster,  */
  /* cum_ = running total of rows. Rows are counted directly so the   */
  /* size does not depend on missing values in some numeric variable. */
  /* The grand total is stored in macro variable tot_.                */
  data weights_(keep = &cluster mi_ cum_);
    set &frame(keep = &cluster) end = eof_;
    by &cluster;
    if first.&cluster then mi_ = 0;
    mi_ + 1;
    cum_ + 1;
    if last.&cluster then output;
    if eof_ then call symput('tot_', trim(left(put(cum_, best12.))));
  run;

  /* Upper bound of each cluster on the (0,1] scale. This needs its */
  /* own step because tot_ only exists once the step above has run. */
  data weights_;
    set weights_;
    u_ = cum_ / &tot_;
  run;

  /* n uniform draws, one per first-stage selection. One further draw */
  /* from the same stream becomes the seed of the second stage, so a  */
  /* positive seed reproduces the whole sample while seed 0 remains   */
  /* clock driven. The derived seed lies in 1 .. 2147483645.          */
  data unifs_(keep = u_);
    do i_ = 1 to &n;
      u_ = ranuni(&seed);
      output;
    end;
    call symput('seed2_',
                trim(left(put(1 + floor(2147483645 * ranuni(&seed)), 12.))));
  run;

  proc sort data = unifs_;
    by u_;
  run;

  /* Interleave bounds and draws by u_. Rows coming from unifs_ have  */
  /* cum_ missing and are counted. The next cluster row is the        */
  /* cluster those draws fell into, and it is written count_ times.   */
  data outcl_;
    set weights_ unifs_;
    by u_;
    retain count_ 0;
    flag_ = 1;
    if cum_ = . then count_ = count_ + 1;
    else if count_ > 0 then do;
      do i_ = 1 to count_;
        output;
      end;
      count_ = 0;
    end;
    drop cum_ u_;
  run;

  proc sort data = outcl_;
    by &cluster;
  run;

  proc sort data = &setup;
    by &cluster;
  run;

  /* Attach the setup variables and keep selected clusters only. */
  data outcl_;
    merge &setup outcl_;
    by &cluster;
    if flag_ = 1 then output;
  run;

  /* nclusts_ : one row per selected cluster, used for the listing.   */
  /* ntry_    : every first-stage hit with a running counter cnt_.    */
  /* sel1_    : one row per selected cluster with all variables. It   */
  /*            is the data set merged with the frame below, so the   */
  /*            merge is one-to-many. Merging outcl_ directly would   */
  /*            repeat the last frame row whenever a cluster is hit   */
  /*            more often than it has frame rows.                    */
  data nclusts_(keep = count_ &cluster) ntry_ sel1_(drop = cnt_);
    set outcl_;
    by &cluster;
    if first.&cluster then do;
      output nclusts_;
      output sel1_;
    end;
    cnt_ + 1;
    output ntry_;
  run;

  /* ---- Stage 2: simple random sample inside each replication ---- */

  /* Copy every unit of a selected cluster once per replication. */
  data stage1_;
    merge sel1_ &frame;
    by &cluster;
    if flag_ = 1 then do;
      &rep = 0;
      do i_ = 1 to count_;
        &rep = &rep + 1;
        output;
      end;
    end;
  run;

  proc sort data = stage1_;
    by &cluster &rep;
  run;

  /* help_ numbers the (cluster, replication) groups 1, 2, 3, ... */
  data stage1_;
    set stage1_;
    by &cluster &rep;
    if first.&rep then help_ + 1;
  run;

  /* Sequential selection of mi units out of mpopi within each group. */
  /*   nprime_ = units of the group not yet examined                  */
  /*   rprime_ = units that may still be skipped                      */
  /*   pprime_ = probability of skipping every unit examined since    */
  /*             the last selection, including the current one        */
  /* A unit is skipped while pprime_ exceeds the current draw u_.     */
  /* Otherwise it is selected and a fresh draw starts. stage1_ is     */
  /* already in help_ order, so no further sort is required.          */
  data &sample;
    set stage1_;
    by help_;
    retain nprime_ 0 pprime_ 1 rprime_ 0 u_ 0;
    if first.help_ then do;
      nprime_ = &mpopi;
      rprime_ = &mpopi - &mi;
      u_ = ranuni(&seed2_);
      /* Start each group from scratch instead of carrying pprime_ */
      /* over from the previous group.                             */
      pprime_ = rprime_ / nprime_;
    end;
    if pprime_ > u_ then do;
      rprime_ = rprime_ - 1;
      nprime_ = nprime_ - 1;
      if nprime_ > 0 then pprime_ = pprime_ * rprime_ / nprime_;
    end;
    else do;
      output;
      nprime_ = nprime_ - 1;
      pprime_ = 1;
      u_ = ranuni(&seed2_);
      if nprime_ > 0 then pprime_ = pprime_ * rprime_ / nprime_;
    end;
    drop nprime_ pprime_ rprime_ u_ flag_ count_ i_ help_ mi_ &mi &mpopi;
  run;

  /* ---- Optional listings ---- */
  %if %length(&noprint) = 0 %then %do;
    proc print data = nclusts_ noobs split = '*';
      title1 'Two-stage Cluster Sample';
      title2 'Clusters Selected at First Stage';
      title3 'With Probability Proportional to Size';
      label count_ = 'Replications';
      var &cluster count_;
    run;

    proc print data = &sample;
      title1 'Two-stage Cluster Sample';
      title2 'Clusters Selected with Probability Proportional to Size';
      title3 "Output Data Set = &sample";
      title4 "Replication variable = &rep";
    run;
  %end;

  /* Clear the titles set above. */
  title;
%mend cl2pps;